In [1]:
import pandas as pd
import numpy as np

Getting the CIOCS Data Ready


In [2]:
# Load the latest CIOCS sheet from its Google Sheets CSV export.
ciocs_url = 'https://docs.google.com/spreadsheets/d/1pE5kECEnegc6qbkntT-WnuJSP5UhAEHvaSdBy4KrwFo/export?format=csv'
ciocs_columns = [
    'City', 'Week', 'Tests', 'Reagent', 'Condoms', 'Age', 'Men',
    'Origin', 'Country', 'Skin', 'StartDate', 'EndDate', 'Note',
]
ciocs = pd.read_csv(
    ciocs_url,
    header=5,               # header row index; its labels are replaced by ciocs_columns
    names=ciocs_columns,
    na_values=['NA', ''],
    parse_dates=[10, 11],   # positions of StartDate and EndDate in ciocs_columns
)
ciocs.head()


Out[2]:
City Week Tests Reagent Condoms Age Men Origin Country Skin StartDate EndDate Note
0 Belo Horizonte 1 NaN NaN NaN NaN NaN NaN NaN NaN 2014-06-12 2014-06-15 not a full week
1 Belo Horizonte 2 NaN NaN NaN NaN NaN NaN NaN NaN 2014-06-16 2014-06-22 NaN
2 Belo Horizonte 3 NaN NaN NaN NaN NaN NaN NaN NaN 2014-06-23 2014-06-29 NaN
3 Belo Horizonte 4 NaN NaN NaN NaN NaN NaN NaN NaN 2014-06-30 2014-07-06 NaN
4 Belo Horizonte 5 NaN NaN NaN NaN NaN NaN NaN NaN 2014-07-07 2014-07-13 NaN

In [3]:
# Check which dtype pandas inferred for each column.
ciocs.dtypes


Out[3]:
City                 object
Week                  int64
Tests               float64
Reagent             float64
Condoms             float64
Age                  object
Men                 float64
Origin               object
Country              object
Skin                 object
StartDate    datetime64[ns]
EndDate      datetime64[ns]
Note                 object
dtype: object

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
ciocs.describe()


Out[4]:
Week Tests Reagent Condoms Men
count 60.000000 34.000000 34.000000 7.000000 33.000000
mean 3.000000 125.411765 2.705882 30781.571429 60.003030
std 1.426148 153.538095 4.522867 26659.579041 10.135658
min 1.000000 11.000000 0.000000 179.000000 35.100000
25% 2.000000 31.250000 0.000000 15450.000000 52.800000
50% 3.000000 57.000000 1.000000 24192.000000 61.600000
75% 4.000000 127.000000 3.000000 40000.000000 66.700000
max 5.000000 563.000000 17.000000 80200.000000 78.000000

Number of Tests


In [5]:
# Summary statistics for the Tests column on its own.
ciocs.loc[:, 'Tests'].describe()


Out[5]:
count     34.000000
mean     125.411765
std      153.538095
min       11.000000
25%       31.250000
50%       57.000000
75%      127.000000
max      563.000000
Name: Tests, dtype: float64

In [6]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [7]:
from ggplot import *
%matplotlib inline

ggplot(aes(x='City'), data=ciocs) + geom_bar(stat = 'identity', y='Tests')


Out[7]:
<repr(<ggplot.ggplot.ggplot at 0x7f04a1e00150>) failed: AssertionError: incompatible sizes: argument 'height' must be length 60 or scalar>

In [8]:
# Tests by week start date: one coloured line per city plus a smoothed trend.
# Other qualitative Brewer palettes: 'Accent', 'Dark2', 'Paired', 'Pastel1', 'Pastel2',
# 'Set1', 'Set2', 'Set3'.
# NOTE(review): the saved output of this cell is "KeyError: 0", i.e. the chart did not render.
(
    ggplot(aes(x='StartDate', y='Tests', colour='City'), data=ciocs)
    + geom_line()
    + stat_smooth()
    + scale_color_brewer(type='qual', palette='Paired')
)


Out[8]:
<repr(<ggplot.ggplot.ggplot at 0x7f04b33e3d10>) failed: KeyError: 0>

More Tests, More Reagent Tests?


In [9]:
# Scatter of Reagent against Tests with a linear ('lm') smoother, seaborn theme.
# NOTE(review): the saved output of this cell is "KeyError: 0", i.e. the chart did not render.
(
    ggplot(aes(x='Tests', y='Reagent'), data=ciocs)
    + geom_point()
    + stat_smooth(method='lm', colour='#00aeef', fill='#00aeef')
    + theme_seaborn()
)


Out[9]:
<repr(<ggplot.ggplot.ggplot at 0x7f04a203db90>) failed: KeyError: 0>

In [10]:
# Scatter of Reagent against Tests with the default smoother at span=0.6, black-and-white theme.
(
    ggplot(aes(x='Tests', y='Reagent'), data=ciocs)
    + geom_point()
    + stat_smooth(span=0.6, colour='#00aeef', fill='#00aeef')
    + theme_bw()
)


Out[10]:
<ggplot: (8755999879353)>

In [10]:
# Same scatter of Reagent against Tests, using the linear smoother and the default theme.
# NOTE(review): the saved output of this cell is "KeyError: 0", i.e. the chart did not render.
(
    ggplot(aes(x='Tests', y='Reagent'), data=ciocs)
    + geom_point()
    + stat_smooth(method='lm', colour='#00aeef', fill='#00aeef')
)


Out[10]:
<repr(<ggplot.ggplot.ggplot at 0x7f04a182e490>) failed: KeyError: 0>

In [18]:
# Star import: the Bokeh cells further down call figure/hold/line/show and the
# legend/axis/grid helpers unqualified.
# NOTE(review): ggplot was star-imported earlier too, so the two namespaces may clash — confirm.
from bokeh.plotting import *
%matplotlib inline
# Presumably directs Bokeh output into the notebook; the saved output reads
# "BokehJS successfully loaded."
output_notebook()


BokehJS successfully loaded.

In [25]:
# Styled Bokeh time-series chart of Tests against StartDate.
# The calls below act on Bokeh's implicit "current plot", so their order is significant.
figure(
    title='Number of Tests',        # Plot title
    title_text_font='Courier New',  # Title font
    title_text_color='#5d6263',     # Title font colour
    plot_width=1000,                # Plot width
    plot_height=600,                # Plot height
    background_fill='#f6f6f6',      # Background colour
    border_fill='#f6f6f6',          # Border background
    v_symmetry=True,                # NOTE(review): effect not evident from this cell — confirm
    h_symmetry=True,                # NOTE(review): effect not evident from this cell — confirm
    outline_line_color='#f6f6f6',   # Plot area border colour
    x_axis_type = 'datetime',       # For timeseries only
    #tools='pan,box_zoom,previewsave,resize,select,reset' # Available: pan,wheel_zoom,box_zoom,previewsave,resize,select,reset
)
# Presumably keeps subsequent glyphs on the figure created above — confirm for this Bokeh version.
hold()
line(
    ciocs['StartDate'],             # x
    ciocs['Tests'],                 # y
    color='#00aeef',                # Line colour
    line_width=3,                   # Line width in px
    legend='Tests',                 # Legend label
)
# Legend: same font and text colour as the title, border blended into the background colour.
legend().label_text_font='Courier New'
legend().label_text_color='#5d6263'
legend().border_line_color='#f6f6f6'
# Axes: no line on the y axis, light grey line on the x axis.
yaxis().axis_line_color = None
xaxis().axis_line_color = '#d4d4d4'
# Tick labels on both axes: Courier New, 12pt.
axis().major_label_text_font="Courier New"
axis().major_label_text_font_size="12pt"
# Grid: no lines for the x grid, thin light grey lines for the y grid.
xgrid().grid_line_color = None
ygrid().grid_line_color = "#d4d4d4"
ygrid().grid_line_width = 1
# Render the finished plot.
show()


Number of Reagent Tests


In [35]:
# Reagent by week: one coloured line per city plus a smoothed trend.
# NOTE(review): the saved output of this cell is "KeyError: 0", i.e. the chart did not render.
(
    ggplot(aes(x='Week', y='Reagent', colour='City'), data=ciocs)
    + geom_line()
    + stat_smooth()
)


Out[35]:
<repr(<ggplot.ggplot.ggplot at 0x7f04a2069490>) failed: KeyError: 0>

In [27]:
# Reagent by week per city: broad smoothed line without the confidence band (se=False),
# overlaid with slightly transparent points.
# NOTE(review): the saved output of this cell is "KeyError: 0", i.e. the chart did not render.
(
    ggplot(aes(x='Week', y='Reagent', colour='City'), data=ciocs)
    + stat_smooth(se=False, size=8)
    + geom_point(alpha=0.8)
)


Out[27]:
<repr(<ggplot.ggplot.ggplot at 0x7f04a15f8090>) failed: KeyError: 0>

In [28]:
# Reagent by week, one facet per city: jittered points with a broad smoothed line.
# NOTE(review): the saved output of this cell is "IndexError: index out of bounds",
# i.e. the chart did not render.
(
    ggplot(aes(x='Week', y='Reagent'), data=ciocs)
    + geom_jitter()
    + stat_smooth(se=False, size=8, colour='#00aeef')
    + facet_wrap('City')
)


Out[28]:
<repr(<ggplot.ggplot.ggplot at 0x7f04a2078690>) failed: IndexError: index out of bounds>

In [29]:
# Bokeh line chart of Reagent against StartDate.
# NOTE(review): the axis type and toolbar options are passed to line() here, whereas the
# "Number of Tests" figure passes x_axis_type to figure() — confirm both are honoured.
figure()
hold()
line(
    ciocs['StartDate'],
    ciocs['Reagent'],
    color='#00447c',
    legend='Reagent Tests',
    x_axis_type='datetime',
    tools='pan,box_zoom,previewsave,resize,select,reset',
)
show()


Number of Condoms Distributed


In [30]:
# Condoms by week: one coloured line per city plus a smoothed trend.
# NOTE(review): the saved output of this cell is "KeyError: 0", i.e. the chart did not render.
(
    ggplot(aes(x='Week', y='Condoms', colour='City'), data=ciocs)
    + geom_line()
    + stat_smooth()
)


Out[30]:
<repr(<ggplot.ggplot.ggplot at 0x7f049d2af1d0>) failed: KeyError: 0>

In [18]:
# Bokeh line chart of Condoms against StartDate.
# NOTE(review): the axis type and toolbar options are passed to line() here, whereas the
# "Number of Tests" figure passes x_axis_type to figure() — confirm both are honoured.
figure()
hold()
line(
    ciocs['StartDate'],
    ciocs['Condoms'],
    color='#cf5c42',
    line_width=3,
    legend='Condoms Distributed',
    x_axis_type='datetime',
    tools='pan,box_zoom,previewsave,resize,select,reset',
)
show()


Bokeh Plot
Plots

Getting the Twitter Data Ready


In [1]:
from IPython.core.display import HTML
# Read the custom stylesheet and pass it to IPython's HTML display object.
# The context manager closes the file handle as soon as the contents have been read.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)


Out[1]:

In [ ]: